SETUP

Output dir

# Root folder for generated artefacts. The trailing slash is kept because
# `savedir` is built from it by plain concatenation.
path <- "results/"
# Figures produced by this notebook are written to results/Probing.
savedir <- paste0(path, "Probing")
# A plain `if` replaces the scalar ifelse(); recursive = TRUE creates `path`
# as well when it does not exist yet.
if (!dir.exists(savedir)) {
  dir.create(savedir, recursive = TRUE)
}

READ DATA

# Location and name of the combined probing results table.
model2plausibility_dir <- "../probing/results/model2plausibility"
datafile <- "all_combined_kfold_new.csv"

dat <- read.csv(file.path(model2plausibility_dir, datafile))

# Discard the `X` column (presumably a row index written by the exporter --
# TODO confirm).
dat <- subset(dat, select = -X)

# Fix the model order and treat Layer as categorical for the regression that
# follows.
dat$Model <- factor(dat$Model, levels = c("bert-large-cased", "roberta-large", "gpt2-xl"))
dat$Layer <- as.factor(dat$Layer)

STATS

# Keep only the DTFit rows and fit one coefficient per Layer x Model cell.
# Layer/model combinations that do not occur in the data are reported as NA
# ("not defined because of singularities") in the summary.
dat.DTFit <- filter(dat, Dataset == "DTFit")

m.DTFit <- lm(Accuracy ~ Layer:Model, data = dat.DTFit)
summary(m.DTFit)

Call:
lm(formula = Accuracy ~ Layer:Model, data = dat.DTFit)

Residuals:
      Min        1Q    Median        3Q       Max 
-0.138173 -0.027300  0.000385  0.027628  0.125481 

Coefficients: (49 not defined because of singularities)
                                Estimate Std. Error t value Pr(>|t|)    
(Intercept)                    0.7958654  0.0132318  60.148  < 2e-16 ***
Layer1:Modelbert-large-cased  -0.2958654  0.0187126 -15.811  < 2e-16 ***
Layer2:Modelbert-large-cased  -0.1811859  0.0187126  -9.683  < 2e-16 ***
Layer3:Modelbert-large-cased  -0.1535577  0.0187126  -8.206 8.26e-16 ***
Layer4:Modelbert-large-cased  -0.1322115  0.0187126  -7.065 3.30e-12 ***
Layer5:Modelbert-large-cased  -0.1462180  0.0187126  -7.814 1.61e-14 ***
Layer6:Modelbert-large-cased  -0.1245513  0.0187126  -6.656 4.99e-11 ***
Layer7:Modelbert-large-cased  -0.1069872  0.0187126  -5.717 1.49e-08 ***
Layer8:Modelbert-large-cased  -0.0882372  0.0187126  -4.715 2.81e-06 ***
Layer9:Modelbert-large-cased  -0.0250641  0.0187126  -1.339 0.180785    
Layer10:Modelbert-large-cased -0.0201282  0.0187126  -1.076 0.282384    
Layer11:Modelbert-large-cased -0.0289744  0.0187126  -1.548 0.121895    
Layer12:Modelbert-large-cased  0.0140064  0.0187126   0.749 0.454361    
Layer13:Modelbert-large-cased  0.0029487  0.0187126   0.158 0.874825    
Layer14:Modelbert-large-cased -0.0009936  0.0187126  -0.053 0.957667    
Layer15:Modelbert-large-cased  0.0115064  0.0187126   0.615 0.538781    
Layer16:Modelbert-large-cased  0.0355128  0.0187126   1.898 0.058055 .  
Layer17:Modelbert-large-cased  0.0389423  0.0187126   2.081 0.037721 *  
Layer18:Modelbert-large-cased  0.0417308  0.0187126   2.230 0.025998 *  
Layer19:Modelbert-large-cased  0.0213141  0.0187126   1.139 0.255008    
Layer20:Modelbert-large-cased  0.0251282  0.0187126   1.343 0.179673    
Layer21:Modelbert-large-cased  0.0190705  0.0187126   1.019 0.308428    
Layer22:Modelbert-large-cased -0.0012179  0.0187126  -0.065 0.948120    
Layer23:Modelbert-large-cased -0.0507051  0.0187126  -2.710 0.006868 ** 
Layer24:Modelbert-large-cased -0.0187821  0.0187126  -1.004 0.315798    
Layer25:Modelbert-large-cased         NA         NA      NA       NA    
Layer26:Modelbert-large-cased         NA         NA      NA       NA    
Layer27:Modelbert-large-cased         NA         NA      NA       NA    
Layer28:Modelbert-large-cased         NA         NA      NA       NA    
Layer29:Modelbert-large-cased         NA         NA      NA       NA    
Layer30:Modelbert-large-cased         NA         NA      NA       NA    
Layer31:Modelbert-large-cased         NA         NA      NA       NA    
Layer32:Modelbert-large-cased         NA         NA      NA       NA    
Layer33:Modelbert-large-cased         NA         NA      NA       NA    
Layer34:Modelbert-large-cased         NA         NA      NA       NA    
Layer35:Modelbert-large-cased         NA         NA      NA       NA    
Layer36:Modelbert-large-cased         NA         NA      NA       NA    
Layer37:Modelbert-large-cased         NA         NA      NA       NA    
Layer38:Modelbert-large-cased         NA         NA      NA       NA    
Layer39:Modelbert-large-cased         NA         NA      NA       NA    
Layer40:Modelbert-large-cased         NA         NA      NA       NA    
Layer41:Modelbert-large-cased         NA         NA      NA       NA    
Layer42:Modelbert-large-cased         NA         NA      NA       NA    
Layer43:Modelbert-large-cased         NA         NA      NA       NA    
Layer44:Modelbert-large-cased         NA         NA      NA       NA    
Layer45:Modelbert-large-cased         NA         NA      NA       NA    
Layer46:Modelbert-large-cased         NA         NA      NA       NA    
Layer47:Modelbert-large-cased         NA         NA      NA       NA    
Layer48:Modelbert-large-cased         NA         NA      NA       NA    
Layer1:Modelroberta-large     -0.2958654  0.0187126 -15.811  < 2e-16 ***
Layer2:Modelroberta-large     -0.2882051  0.0187126 -15.402  < 2e-16 ***
Layer3:Modelroberta-large     -0.1735897  0.0187126  -9.277  < 2e-16 ***
Layer4:Modelroberta-large     -0.1853205  0.0187126  -9.904  < 2e-16 ***
Layer5:Modelroberta-large     -0.1645833  0.0187126  -8.795  < 2e-16 ***
Layer6:Modelroberta-large     -0.1020513  0.0187126  -5.454 6.45e-08 ***
Layer7:Modelroberta-large     -0.1158654  0.0187126  -6.192 9.18e-10 ***
Layer8:Modelroberta-large     -0.1599679  0.0187126  -8.549  < 2e-16 ***
Layer9:Modelroberta-large     -0.0980449  0.0187126  -5.240 2.02e-07 ***
Layer10:Modelroberta-large    -0.0654808  0.0187126  -3.499 0.000490 ***
Layer11:Modelroberta-large    -0.0311539  0.0187126  -1.665 0.096303 .  
Layer12:Modelroberta-large    -0.0148077  0.0187126  -0.791 0.428973    
Layer13:Modelroberta-large     0.0317308  0.0187126   1.696 0.090305 .  
Layer14:Modelroberta-large     0.0301282  0.0187126   1.610 0.107752    
Layer15:Modelroberta-large     0.0532372  0.0187126   2.845 0.004546 ** 
Layer16:Modelroberta-large     0.0468590  0.0187126   2.504 0.012458 *  
Layer17:Modelroberta-large     0.0256090  0.0187126   1.369 0.171498    
Layer18:Modelroberta-large     0.0681731  0.0187126   3.643 0.000285 ***
Layer19:Modelroberta-large     0.0604808  0.0187126   3.232 0.001275 ** 
Layer20:Modelroberta-large     0.0466667  0.0187126   2.494 0.012822 *  
Layer21:Modelroberta-large     0.0554808  0.0187126   2.965 0.003111 ** 
Layer22:Modelroberta-large     0.0367308  0.0187126   1.963 0.049979 *  
Layer23:Modelroberta-large     0.0554808  0.0187126   2.965 0.003111 ** 
Layer24:Modelroberta-large     0.0467308  0.0187126   2.497 0.012700 *  
Layer25:Modelroberta-large            NA         NA      NA       NA    
Layer26:Modelroberta-large            NA         NA      NA       NA    
Layer27:Modelroberta-large            NA         NA      NA       NA    
Layer28:Modelroberta-large            NA         NA      NA       NA    
Layer29:Modelroberta-large            NA         NA      NA       NA    
Layer30:Modelroberta-large            NA         NA      NA       NA    
Layer31:Modelroberta-large            NA         NA      NA       NA    
Layer32:Modelroberta-large            NA         NA      NA       NA    
Layer33:Modelroberta-large            NA         NA      NA       NA    
Layer34:Modelroberta-large            NA         NA      NA       NA    
Layer35:Modelroberta-large            NA         NA      NA       NA    
Layer36:Modelroberta-large            NA         NA      NA       NA    
Layer37:Modelroberta-large            NA         NA      NA       NA    
Layer38:Modelroberta-large            NA         NA      NA       NA    
Layer39:Modelroberta-large            NA         NA      NA       NA    
Layer40:Modelroberta-large            NA         NA      NA       NA    
Layer41:Modelroberta-large            NA         NA      NA       NA    
Layer42:Modelroberta-large            NA         NA      NA       NA    
Layer43:Modelroberta-large            NA         NA      NA       NA    
Layer44:Modelroberta-large            NA         NA      NA       NA    
Layer45:Modelroberta-large            NA         NA      NA       NA    
Layer46:Modelroberta-large            NA         NA      NA       NA    
Layer47:Modelroberta-large            NA         NA      NA       NA    
Layer48:Modelroberta-large            NA         NA      NA       NA    
Layer1:Modelgpt2-xl           -0.2769551  0.0187126 -14.800  < 2e-16 ***
Layer2:Modelgpt2-xl           -0.1347115  0.0187126  -7.199 1.32e-12 ***
Layer3:Modelgpt2-xl           -0.1043269  0.0187126  -5.575 3.31e-08 ***
Layer4:Modelgpt2-xl           -0.0983974  0.0187126  -5.258 1.83e-07 ***
Layer5:Modelgpt2-xl           -0.1082692  0.0187126  -5.786 1.01e-08 ***
Layer6:Modelgpt2-xl           -0.0917308  0.0187126  -4.902 1.13e-06 ***
Layer7:Modelgpt2-xl           -0.0555449  0.0187126  -2.968 0.003077 ** 
Layer8:Modelgpt2-xl           -0.0580449  0.0187126  -3.102 0.001985 ** 
Layer9:Modelgpt2-xl           -0.0438782  0.0187126  -2.345 0.019260 *  
Layer10:Modelgpt2-xl          -0.0652564  0.0187126  -3.487 0.000513 ***
Layer11:Modelgpt2-xl          -0.0579167  0.0187126  -3.095 0.002031 ** 
Layer12:Modelgpt2-xl          -0.0326923  0.0187126  -1.747 0.080979 .  
Layer13:Modelgpt2-xl          -0.0136859  0.0187126  -0.731 0.464749    
Layer14:Modelgpt2-xl          -0.0201923  0.0187126  -1.079 0.280855    
Layer15:Modelgpt2-xl          -0.0100321  0.0187126  -0.536 0.592019    
Layer16:Modelgpt2-xl          -0.0074039  0.0187126  -0.396 0.692453    
Layer17:Modelgpt2-xl          -0.0238461  0.0187126  -1.274 0.202886    
Layer18:Modelgpt2-xl          -0.0248718  0.0187126  -1.329 0.184150    
Layer19:Modelgpt2-xl          -0.0302564  0.0187126  -1.617 0.106264    
Layer20:Modelgpt2-xl           0.0000641  0.0187126   0.003 0.997268    
Layer21:Modelgpt2-xl           0.0190064  0.0187126   1.016 0.310056    
Layer22:Modelgpt2-xl          -0.0098397  0.0187126  -0.526 0.599137    
Layer23:Modelgpt2-xl           0.0028526  0.0187126   0.152 0.878875    
Layer24:Modelgpt2-xl           0.0164423  0.0187126   0.879 0.379821    
Layer25:Modelgpt2-xl           0.0076923  0.0187126   0.411 0.681118    
Layer26:Modelgpt2-xl          -0.0035577  0.0187126  -0.190 0.849257    
Layer27:Modelgpt2-xl          -0.0261538  0.0187126  -1.398 0.162574    
Layer28:Modelgpt2-xl          -0.0148077  0.0187126  -0.791 0.428973    
Layer29:Modelgpt2-xl          -0.0087500  0.0187126  -0.468 0.640189    
Layer30:Modelgpt2-xl          -0.0313141  0.0187126  -1.673 0.094606 .  
Layer31:Modelgpt2-xl          -0.0162820  0.0187126  -0.870 0.384481    
Layer32:Modelgpt2-xl          -0.0264744  0.0187126  -1.415 0.157491    
Layer33:Modelgpt2-xl          -0.0248718  0.0187126  -1.329 0.184150    
Layer34:Modelgpt2-xl          -0.0274039  0.0187126  -1.464 0.143432    
Layer35:Modelgpt2-xl          -0.0365064  0.0187126  -1.951 0.051392 .  
Layer36:Modelgpt2-xl          -0.0074359  0.0187126  -0.397 0.691190    
Layer37:Modelgpt2-xl          -0.0149680  0.0187126  -0.800 0.423996    
Layer38:Modelgpt2-xl          -0.0303205  0.0187126  -1.620 0.105527    
Layer39:Modelgpt2-xl          -0.0438141  0.0187126  -2.341 0.019437 *  
Layer40:Modelgpt2-xl          -0.0113461  0.0187126  -0.606 0.544449    
Layer41:Modelgpt2-xl          -0.0250000  0.0187126  -1.336 0.181901    
Layer42:Modelgpt2-xl          -0.0298718  0.0187126  -1.596 0.110777    
Layer43:Modelgpt2-xl          -0.0100961  0.0187126  -0.540 0.589655    
Layer44:Modelgpt2-xl          -0.0173718  0.0187126  -0.928 0.353486    
Layer45:Modelgpt2-xl          -0.0363141  0.0187126  -1.941 0.052629 .  
Layer46:Modelgpt2-xl           0.0017308  0.0187126   0.092 0.926328    
Layer47:Modelgpt2-xl          -0.0239103  0.0187126  -1.278 0.201676    
Layer48:Modelgpt2-xl                  NA         NA      NA       NA    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.04184 on 864 degrees of freedom
Multiple R-squared:  0.7949,    Adjusted R-squared:  0.7724 
F-statistic: 35.25 on 95 and 864 DF,  p-value: < 2.2e-16

READ HUMAN CEILING DATA

# Layer was a factor for the regression; make it numeric again so it can be
# used with max() and on a continuous x axis below.
dat <- dat %>% mutate(Layer = as.numeric(as.character(Layer)))

# From here on this variable points at the human-ceiling results folder.
model2plausibility_dir <- "../probing/results/model2human_ceiling"

# Read one human-ceiling CSV and annotate it with metadata parsed from its name.
#
# File names are split on "_"; parts 2, 3 and 4 become Dataset, VoiceType and
# TrialType (with the ".csv" suffix removed). Part 1 is not used.
#
# directory: folder that contains the file.
# filename:  bare file name with at least four "_"-separated parts.
# Returns a data frame with columns Iteration, Accuracy (the two header-less
# CSV columns), Dataset, VoiceType, TrialType and Plot = "HumanCeiling".
read_data <- function(directory, filename) {
  print(filename)
  metadata <- str_split(filename, "_")[[1]]
  if (length(metadata) < 4) {
    stop("Unexpected ceiling file name: ", filename, call. = FALSE)
  }
  # Read from the `directory` argument instead of a global variable.
  d <- read.delim(file.path(directory, filename), header = FALSE, sep = ",")
  d %>%
    mutate(
      Dataset = metadata[[2]],
      VoiceType = metadata[[3]],
      # Escaped and anchored so only a literal trailing ".csv" is stripped.
      TrialType = str_replace(metadata[[4]], "\\.csv$", "")
    ) %>%
    rename(Iteration = V1, Accuracy = V2) %>%
    mutate(Plot = "HumanCeiling")
}

# `pattern` is a regular expression, not a glob: match names ending in ".csv".
filenames <- list.files(path = model2plausibility_dir, pattern = "\\.csv$")
# Fail early with a clear message; rbind over an empty list would yield NULL.
if (length(filenames) == 0) {
  stop("No human-ceiling CSV files found in ", model2plausibility_dir, call. = FALSE)
}
# Read every file and stack the results into one table.
dat.ceiling <- do.call(
  rbind,
  lapply(filenames, function(x) read_data(model2plausibility_dir, x))
)
[1] "bert-large-cased_DTFit_normal_normal.csv"
[1] "bert-large-cased_EventsAdapt_active-active_AAN-AAN.csv"
[1] "bert-large-cased_EventsAdapt_active-active_AAN-AAR.csv"
[1] "bert-large-cased_EventsAdapt_active-active_AAN-AI.csv"
[1] "bert-large-cased_EventsAdapt_active-active_AI-AAN.csv"
[1] "bert-large-cased_EventsAdapt_active-active_AI-AAR.csv"
[1] "bert-large-cased_EventsAdapt_active-active_AI-AI.csv"
[1] "bert-large-cased_EventsAdapt_active-active_normal-AAR.csv"
[1] "bert-large-cased_EventsAdapt_active-active_normal.csv"
[1] "bert-large-cased_EventsAdapt_active-passive_normal.csv"
[1] "bert-large-cased_EventsAdapt_normal_normal.csv"
[1] "bert-large-cased_EventsAdapt_passive-active_normal.csv"
[1] "bert-large-cased_EventsAdapt_passive-passive_normal.csv"
[1] "bert-large-cased_EventsRev_normal_normal.csv"
# Tag a table with a model name and an x position to the right of that model's
# layers.
#
# dataframe:  table to annotate.
# model_name: model whose rows in the global `dat` determine the position.
# Returns (invisibly) `dataframe` with Layer set to the model's largest layer
# plus a gap, and Model set to `model_name`.
add_info <- function(dataframe, model_name) {
  # gpt2-xl gets a gap of 10, every other model a gap of 5.
  toadd <- if (grepl("gpt2-xl", model_name)) 10 else 5
  last_layer <- max(subset(dat, Model == model_name)$Layer)
  tagged <- mutate(dataframe, Layer = last_layer + toadd, Model = model_name)
  invisible(tagged)
}

# Model names present in the data, in order of appearance. Going through
# as.character() avoids c() on a factor, which returns integer codes on
# R < 4.1; NA entries are dropped because add_info cannot handle them.
models <- unique(as.character(dat$Model))
models <- models[!is.na(models)]

# One copy of the ceiling table per model, each placed past that model's last
# layer. The name is passed as a factor with dat's levels so the Model column
# keeps the same type and level order as in `dat`.
dat.ceiling.full <- do.call(
  rbind,
  lapply(models, function(m) {
    add_info(dat.ceiling, factor(m, levels = levels(dat$Model)))
  })
)
# Stack the probing results and the replicated ceiling rows into one table.
merged <- rbind(dat, dat.ceiling.full)

PLOT

breaks function x axis

# Tick positions for the layer axis, given the axis limits `x`.
# Axes reaching past 40 get ticks every 10 up to 50 plus one at max(x) + 10;
# shorter axes get ticks every 5 up to 25 plus one at max(x) + 5.
breaks_fun <- function(x) {
  upper <- max(x)
  if (upper > 40) {
    return(c(seq(0, 50, by = 10), upper + 10))
  }
  c(seq(0, 25, by = 5), upper + 5)
}

All datasets

# Layer-wise probe accuracy on the "normal" items of every dataset.
plot_data <- merged %>%
  filter(TrialType == "normal", VoiceType == "normal", Plot != "HumanCeiling") %>%
  mutate(Layer = as.numeric(as.character(Layer)))

# Human ceiling: one summary row per Model x Dataset.
# `merged` holds one full copy of the ceiling rows for each model, so the
# summary is grouped by Model as well; pooling the copies would inflate n and
# shrink the standard error. Layer is constant within a model (the x position
# set by add_info), so grouping by it only keeps the column.
ceil_data <- merged %>%
  filter(TrialType == "normal", VoiceType == "normal", Plot == "HumanCeiling") %>%
  group_by(Model, Layer, Dataset) %>%
  summarise(
    MeanAccuracy = mean(Accuracy),
    SE = sd(Accuracy) / sqrt(n()),
    .groups = "drop"
  )

# One "ceiling" label per facet, shifted left of the ceiling marker
# (6 units for gpt2-xl, 3 for the other models).
ceil_labels <- ceil_data %>%
  distinct(Model, Layer) %>%
  mutate(LabelX = ifelse(Model == "gpt2-xl", Layer - 6, Layer - 3))

ggplot(data = plot_data,
       mapping = aes(x = Layer, y = Accuracy, color = Dataset, group = Dataset)) +
  facet_grid(~Model, scales = "free_x") +
  geom_hline(yintercept = .5, linetype = "dotted") +
  geom_hline(yintercept = 1, linetype = "dotted") +
  stat_summary(geom = "line", fun = "mean") +
  stat_summary(geom = "errorbar", fun.data = "mean_se",
               size = 0.2, width = 0.1) +
  # The ceiling layers use summary tables without an Accuracy column, so they
  # must not inherit the plot-level aesthetics.
  geom_point(data = ceil_data, inherit.aes = FALSE,
             aes(x = Layer, y = MeanAccuracy, color = Dataset), size = 0.2) +
  geom_errorbar(data = ceil_data, inherit.aes = FALSE,
                aes(x = Layer, ymin = MeanAccuracy - SE, ymax = MeanAccuracy + SE,
                    color = Dataset),
                width = 0.05) +
  geom_text(data = ceil_labels, inherit.aes = FALSE,
            aes(x = LabelX, y = 0.98),
            size = 2.5,
            label = "ceiling",
            colour = "#6d6d6d") +
  theme_classic() +
  scale_x_continuous(breaks = breaks_fun) +
  scale_y_continuous(breaks = seq(0.4, 1.2, 0.1))

# The device now matches the ".png" file name (it used to write TIFF data into
# a .png file). If TIFF output is required, change extension and device together.
savename <- "model2plausibility_all_datasets_ceiled.png"
ggsave(file.path(savedir, savename), width = 20, height = 8, units = "cm",
       device = "png", dpi = 700)

DTFit

# Accuracy by layer for the DTFit dataset, one panel per value of `Plot`.
dataset <- "DTFit"

ggplot(data = dat %>% filter(Dataset == dataset),
       # Group by the Dataset column; the former `group = dataset` matched no
       # column and only worked by falling back to the global string.
       mapping = aes(x = Layer, y = Accuracy, group = Dataset)) +
  facet_wrap(~Plot, scales = "free_x") +
  scale_x_continuous(breaks = breaks_fun, limits = c(0, NA)) +
  # Dotted reference lines at 0.5 and 1.
  geom_hline(yintercept = .5, linetype = "dotted") +
  geom_hline(yintercept = 1, linetype = "dotted") +
  # Mean per layer with standard-error bars.
  stat_summary(geom = "line", fun = "mean") +
  stat_summary(geom = "errorbar", fun.data = "mean_se",
               color = "black", size = 0.5, width = 0.1) +
  theme_classic()

savename <- "model2plausibility_DTFit.png"
ggsave(file.path(savedir, savename), width = 16, height = 8, units = "cm")

EventsRev

# Accuracy by layer for the EventsRev dataset, one panel per value of `Plot`.
dataset <- "EventsRev"

ggplot(data = dat %>% filter(Dataset == dataset),
       # Group by the Dataset column; the former `group = dataset` matched no
       # column and only worked by falling back to the global string.
       mapping = aes(x = Layer, y = Accuracy, group = Dataset)) +
  facet_grid(~Plot, scales = "free_x") +
  scale_x_continuous(breaks = breaks_fun, limits = c(0, NA)) +
  # Dotted reference lines at 0.5 and 1.
  geom_hline(yintercept = .5, linetype = "dotted") +
  geom_hline(yintercept = 1, linetype = "dotted") +
  # Mean per layer with standard-error bars.
  stat_summary(geom = "line", fun = "mean") +
  stat_summary(geom = "errorbar", fun.data = "mean_se",
               color = "black", size = 0.5, width = 0.1) +
  theme_classic()

savename <- "model2plausibility_EventsRev.png"
ggsave(file.path(savedir, savename), width = 16, height = 8, units = "cm")

EventsAdapt

Active Passive

# EventsAdapt, "normal" trials only: accuracy by layer, one line per VoiceType
# and one panel per model.
dataset <- "EventsAdapt"

ggplot(
  data = filter(dat, Dataset == dataset, TrialType == "normal"),
  mapping = aes(x = Layer, y = Accuracy, color = VoiceType, group = VoiceType)
) +
  # Dotted reference lines at 0.5 and 1.
  geom_hline(yintercept = .5, linetype = "dotted") +
  geom_hline(yintercept = 1, linetype = "dotted") +
  # Mean per layer with standard-error bars (no caps).
  stat_summary(geom = "line", fun = "mean") +
  stat_summary(geom = "errorbar", fun.data = "mean_se", size = 0.2, width = 0) +
  facet_grid(~Model, scales = "free_x") +
  scale_x_continuous(breaks = breaks_fun, limits = c(0, NA)) +
  theme_classic()

savename <- "model2plausibility_EventsAdapt_active-passive.png"
ggsave(paste(savedir, savename, sep = "/"), width = 20, height = 8, units = "cm")

Attempt with human ceiling

# EventsAdapt, "normal" trials: accuracy by layer per VoiceType, with the
# human ceiling drawn to the right of each model's layers.
dataset <- "EventsAdapt"
plot_data <- merged %>%
  filter(Dataset == dataset, TrialType == "normal", Plot != "HumanCeiling")

# Human ceiling: one summary row per Model x VoiceType.
# The Dataset filter keeps the "normal" VoiceType ceiling from pooling the
# other datasets' rows. `merged` holds one full copy of the ceiling rows per
# model, so the summary is grouped by Model as well; pooling the copies would
# inflate n and shrink the standard error.
ceil_data <- merged %>%
  filter(Dataset == dataset, TrialType == "normal", Plot == "HumanCeiling") %>%
  group_by(Model, Layer, VoiceType) %>%
  summarise(
    MeanAccuracy = mean(Accuracy),
    SE = sd(Accuracy) / sqrt(n()),
    .groups = "drop"
  )

# One "ceiling" label per facet, shifted left of the ceiling marker
# (6 units for gpt2-xl, 3 for the other models).
ceil_labels <- ceil_data %>%
  distinct(Model, Layer) %>%
  mutate(LabelX = ifelse(Model == "gpt2-xl", Layer - 6, Layer - 3))

ggplot(data = plot_data,
       mapping = aes(x = Layer, y = Accuracy, color = VoiceType, group = VoiceType)) +
  facet_grid(~Model, scales = "free_x") +
  geom_hline(yintercept = .5, linetype = "dotted") +
  geom_hline(yintercept = 1, linetype = "dotted") +
  stat_summary(geom = "line", fun = "mean") +
  stat_summary(geom = "errorbar", fun.data = "mean_se",
               size = 0.2, width = 0.1) +
  # The ceiling layers use summary tables without an Accuracy column, so they
  # must not inherit the plot-level aesthetics.
  geom_point(data = ceil_data, inherit.aes = FALSE,
             aes(x = Layer, y = MeanAccuracy, color = VoiceType), size = 0.2) +
  geom_errorbar(data = ceil_data, inherit.aes = FALSE,
                aes(x = Layer, ymin = MeanAccuracy - SE, ymax = MeanAccuracy + SE,
                    color = VoiceType),
                width = 0.05) +
  geom_text(data = ceil_labels, inherit.aes = FALSE,
            aes(x = LabelX, y = 0.98),
            size = 2.5,
            label = "ceiling",
            colour = "#6d6d6d") +
  theme_classic() +
  scale_x_continuous(breaks = breaks_fun) +
  scale_y_continuous(breaks = seq(0.2, 1, 0.1))

# The device now matches the ".png" file name (it used to write TIFF data into
# a .png file). If TIFF output is required, change extension and device together.
savename <- "model2plausibility_EventsAdapt_active-passive_ceiled.png"
ggsave(file.path(savedir, savename), width = 20, height = 8, units = "cm",
       device = "png", dpi = 700)

TrialTypes

# EventsAdapt, active-active items, excluding trial types whose name contains
# "AAR": accuracy by layer, one line per TrialType and one panel per model.
dataset <- "EventsAdapt"

ggplot(
  data = filter(dat, Dataset == dataset, VoiceType == "active-active", !grepl('AAR', TrialType)),
  mapping = aes(x = Layer, y = Accuracy, color = TrialType, group = TrialType)
) +
  # Dotted reference lines at 0.5 and 1.
  geom_hline(yintercept = .5, linetype = "dotted") +
  geom_hline(yintercept = 1, linetype = "dotted") +
  # Mean per layer with standard-error bars (no caps).
  stat_summary(geom = "line", fun = "mean") +
  stat_summary(geom = "errorbar", fun.data = "mean_se", size = 0.2, width = 0) +
  facet_wrap(~Model, scales = "free_x") +
  scale_x_continuous(breaks = breaks_fun, limits = c(0, NA)) +
  theme_classic()

savename <- "model2plausibility_EventsAdapt_AI-AAN.png"
ggsave(paste(savedir, savename, sep = "/"), width = 20, height = 8, units = "cm")

Attempt with human ceiling

# EventsAdapt, active-active items without "AAR" trial types: accuracy by
# layer per TrialType, with the human ceiling drawn to the right of each
# model's layers.
dataset <- "EventsAdapt"
plot_data <- merged %>%
  filter(Dataset == dataset, VoiceType == "active-active",
         !grepl("AAR", TrialType), Plot != "HumanCeiling")

# Human ceiling: one summary row per Model x TrialType, restricted to the same
# dataset as the curves. `merged` holds one full copy of the ceiling rows per
# model, so the summary is grouped by Model as well; pooling the copies would
# inflate n and shrink the standard error.
ceil_data <- merged %>%
  filter(Dataset == dataset, VoiceType == "active-active",
         !grepl("AAR", TrialType), Plot == "HumanCeiling") %>%
  group_by(Model, Layer, TrialType) %>%
  summarise(
    MeanAccuracy = mean(Accuracy),
    SE = sd(Accuracy) / sqrt(n()),
    .groups = "drop"
  )

# One "ceiling" label per facet, shifted left of the ceiling marker
# (6 units for gpt2-xl, 3 for the other models).
ceil_labels <- ceil_data %>%
  distinct(Model, Layer) %>%
  mutate(LabelX = ifelse(Model == "gpt2-xl", Layer - 6, Layer - 3))

ggplot(data = plot_data,
       mapping = aes(x = Layer, y = Accuracy, color = TrialType, group = TrialType)) +
  facet_grid(~Model, scales = "free_x") +
  geom_hline(yintercept = .5, linetype = "dotted") +
  geom_hline(yintercept = 1, linetype = "dotted") +
  stat_summary(geom = "line", fun = "mean") +
  stat_summary(geom = "errorbar", fun.data = "mean_se",
               size = 0.2, width = 0.1) +
  # The ceiling layers use summary tables without an Accuracy column, so they
  # must not inherit the plot-level aesthetics.
  geom_point(data = ceil_data, inherit.aes = FALSE,
             aes(x = Layer, y = MeanAccuracy, color = TrialType), size = 0.2) +
  geom_errorbar(data = ceil_data, inherit.aes = FALSE,
                aes(x = Layer, ymin = MeanAccuracy - SE, ymax = MeanAccuracy + SE,
                    color = TrialType),
                width = 0.05) +
  geom_text(data = ceil_labels, inherit.aes = FALSE,
            aes(x = LabelX, y = 0.98),
            size = 2.5,
            label = "ceiling",
            colour = "#6d6d6d") +
  theme_classic() +
  scale_x_continuous(breaks = breaks_fun) +
  scale_y_continuous(breaks = seq(0.5, 1, 0.1))

# The device now matches the ".png" file name (it used to write TIFF data into
# a .png file). If TIFF output is required, change extension and device together.
savename <- "model2plausibility_EventsAdapt_AI-AAN_ceiled.png"
ggsave(file.path(savedir, savename), width = 20, height = 8, units = "cm",
       device = "png", dpi = 700)
---
title: "Probing results"
output: html_notebook
---

# SETUP

```{r setup, include=FALSE, echo=TRUE}
knitr::opts_chunk$set(echo = TRUE)
rm(list=ls()) 
library(ggplot2)
library(dplyr)
library(tidyr)
library(stringr)
library(grid)
library(gridExtra)
library(operator.tools)
library(lme4)
library(lmerTest)
library(cocor)
library(patchwork)
library(gtools)

source('dataloader_utils.R') #includes normalizations, read_data functions
source('stats_utils.R')

# Suppress summarise info
options(dplyr.summarise.inform = FALSE)
```

## Output dir
```{r}
# Root folder for generated artefacts. The trailing slash is kept because
# `savedir` is built from it by plain concatenation.
path <- "results/"
# Figures produced by this notebook are written to results/Probing.
savedir <- paste0(path, "Probing")
# A plain `if` replaces the scalar ifelse(); recursive = TRUE creates `path`
# as well when it does not exist yet.
if (!dir.exists(savedir)) {
  dir.create(savedir, recursive = TRUE)
}
```


# READ DATA
```{r}
# Location and name of the combined probing results table.
model2plausibility_dir <- "../probing/results/model2plausibility"
datafile <- "all_combined_kfold_new.csv"

dat <- read.csv(file.path(model2plausibility_dir, datafile))

# Discard the `X` column (presumably a row index written by the exporter --
# TODO confirm).
dat <- subset(dat, select = -X)

# Fix the model order and treat Layer as categorical for the regression that
# follows.
dat$Model <- factor(dat$Model, levels = c("bert-large-cased", "roberta-large", "gpt2-xl"))
dat$Layer <- as.factor(dat$Layer)
```

# STATS

```{r}
# Keep only the DTFit rows and fit one coefficient per Layer x Model cell.
# Layer/model combinations that do not occur in the data are reported as NA
# ("not defined because of singularities") in the summary.
dat.DTFit <- filter(dat, Dataset == "DTFit")

m.DTFit <- lm(Accuracy ~ Layer:Model, data = dat.DTFit)
summary(m.DTFit)
```

# READ HUMAN CEILING DATA
```{r}
# Layer was a factor for the regression; make it numeric again so it can be
# used with max() and on a continuous x axis below.
dat <- dat %>% mutate(Layer = as.numeric(as.character(Layer)))

# From here on this variable points at the human-ceiling results folder.
model2plausibility_dir <- "../probing/results/model2human_ceiling"

# Read one human-ceiling CSV and annotate it with metadata parsed from its name.
#
# File names are split on "_"; parts 2, 3 and 4 become Dataset, VoiceType and
# TrialType (with the ".csv" suffix removed). Part 1 is not used.
#
# directory: folder that contains the file.
# filename:  bare file name with at least four "_"-separated parts.
# Returns a data frame with columns Iteration, Accuracy (the two header-less
# CSV columns), Dataset, VoiceType, TrialType and Plot = "HumanCeiling".
read_data <- function(directory, filename) {
  print(filename)
  metadata <- str_split(filename, "_")[[1]]
  if (length(metadata) < 4) {
    stop("Unexpected ceiling file name: ", filename, call. = FALSE)
  }
  # Read from the `directory` argument instead of a global variable.
  d <- read.delim(file.path(directory, filename), header = FALSE, sep = ",")
  d %>%
    mutate(
      Dataset = metadata[[2]],
      VoiceType = metadata[[3]],
      # Escaped and anchored so only a literal trailing ".csv" is stripped.
      TrialType = str_replace(metadata[[4]], "\\.csv$", "")
    ) %>%
    rename(Iteration = V1, Accuracy = V2) %>%
    mutate(Plot = "HumanCeiling")
}

# `pattern` is a regular expression, not a glob: match names ending in ".csv".
filenames <- list.files(path = model2plausibility_dir, pattern = "\\.csv$")
# Fail early with a clear message; rbind over an empty list would yield NULL.
if (length(filenames) == 0) {
  stop("No human-ceiling CSV files found in ", model2plausibility_dir, call. = FALSE)
}
# Read every file and stack the results into one table.
dat.ceiling <- do.call(
  rbind,
  lapply(filenames, function(x) read_data(model2plausibility_dir, x))
)

# Tag a table with a model name and an x position to the right of that model's
# layers.
#
# dataframe:  table to annotate.
# model_name: model whose rows in the global `dat` determine the position.
# Returns (invisibly) `dataframe` with Layer set to the model's largest layer
# plus a gap, and Model set to `model_name`.
add_info <- function(dataframe, model_name) {
  # gpt2-xl gets a gap of 10, every other model a gap of 5.
  toadd <- if (grepl("gpt2-xl", model_name)) 10 else 5
  last_layer <- max(subset(dat, Model == model_name)$Layer)
  tagged <- mutate(dataframe, Layer = last_layer + toadd, Model = model_name)
  invisible(tagged)
}

# Model names present in the data, in order of appearance. Going through
# as.character() avoids c() on a factor, which returns integer codes on
# R < 4.1; NA entries are dropped because add_info cannot handle them.
models <- unique(as.character(dat$Model))
models <- models[!is.na(models)]

# One copy of the ceiling table per model, each placed past that model's last
# layer. The name is passed as a factor with dat's levels so the Model column
# keeps the same type and level order as in `dat`.
dat.ceiling.full <- do.call(
  rbind,
  lapply(models, function(m) {
    add_info(dat.ceiling, factor(m, levels = levels(dat$Model)))
  })
)
```
```{r}
# Stack the probing results and the per-model human-ceiling rows into one table.
merged <- rbind(dat, dat.ceiling.full)
```

# PLOT

## breaks function x axis
```{r}
# Tick positions for the layer axis, given the axis limits `x`.
# Axes reaching past 40 get ticks every 10 up to 50 plus one at max(x) + 10;
# shorter axes get ticks every 5 up to 25 plus one at max(x) + 5.
breaks_fun <- function(x) {
  upper <- max(x)
  if (upper > 40) {
    return(c(seq(0, 50, by = 10), upper + 10))
  }
  c(seq(0, 25, by = 5), upper + 5)
}

```

## All datasets
```{r, fig.width=15, fig.height=8}
# Layer-wise probe accuracy on the "normal" items of every dataset.
plot_data <- merged %>%
  filter(TrialType == "normal", VoiceType == "normal", Plot != "HumanCeiling") %>%
  mutate(Layer = as.numeric(as.character(Layer)))

# Human ceiling: one summary row per Model x Dataset.
# `merged` holds one full copy of the ceiling rows for each model, so the
# summary is grouped by Model as well; pooling the copies would inflate n and
# shrink the standard error. Layer is constant within a model (the x position
# set by add_info), so grouping by it only keeps the column.
ceil_data <- merged %>%
  filter(TrialType == "normal", VoiceType == "normal", Plot == "HumanCeiling") %>%
  group_by(Model, Layer, Dataset) %>%
  summarise(
    MeanAccuracy = mean(Accuracy),
    SE = sd(Accuracy) / sqrt(n()),
    .groups = "drop"
  )

# One "ceiling" label per facet, shifted left of the ceiling marker
# (6 units for gpt2-xl, 3 for the other models).
ceil_labels <- ceil_data %>%
  distinct(Model, Layer) %>%
  mutate(LabelX = ifelse(Model == "gpt2-xl", Layer - 6, Layer - 3))

ggplot(data = plot_data,
       mapping = aes(x = Layer, y = Accuracy, color = Dataset, group = Dataset)) +
  facet_grid(~Model, scales = "free_x") +
  geom_hline(yintercept = .5, linetype = "dotted") +
  geom_hline(yintercept = 1, linetype = "dotted") +
  stat_summary(geom = "line", fun = "mean") +
  stat_summary(geom = "errorbar", fun.data = "mean_se",
               size = 0.2, width = 0.1) +
  # The ceiling layers use summary tables without an Accuracy column, so they
  # must not inherit the plot-level aesthetics.
  geom_point(data = ceil_data, inherit.aes = FALSE,
             aes(x = Layer, y = MeanAccuracy, color = Dataset), size = 0.2) +
  geom_errorbar(data = ceil_data, inherit.aes = FALSE,
                aes(x = Layer, ymin = MeanAccuracy - SE, ymax = MeanAccuracy + SE,
                    color = Dataset),
                width = 0.05) +
  geom_text(data = ceil_labels, inherit.aes = FALSE,
            aes(x = LabelX, y = 0.98),
            size = 2.5,
            label = "ceiling",
            colour = "#6d6d6d") +
  theme_classic() +
  scale_x_continuous(breaks = breaks_fun) +
  scale_y_continuous(breaks = seq(0.4, 1.2, 0.1))

# The device now matches the ".png" file name (it used to write TIFF data into
# a .png file). If TIFF output is required, change extension and device together.
savename <- "model2plausibility_all_datasets_ceiled.png"
ggsave(file.path(savedir, savename), width = 20, height = 8, units = "cm",
       device = "png", dpi = 700)
```

## DTFit
```{r}
# Accuracy by layer for the DTFit dataset, one panel per value of `Plot`.
dataset <- "DTFit"

ggplot(data = dat %>% filter(Dataset == dataset),
       # Group by the Dataset column; the former `group = dataset` matched no
       # column and only worked by falling back to the global string.
       mapping = aes(x = Layer, y = Accuracy, group = Dataset)) +
  facet_wrap(~Plot, scales = "free_x") +
  scale_x_continuous(breaks = breaks_fun, limits = c(0, NA)) +
  # Dotted reference lines at 0.5 and 1.
  geom_hline(yintercept = .5, linetype = "dotted") +
  geom_hline(yintercept = 1, linetype = "dotted") +
  # Mean per layer with standard-error bars.
  stat_summary(geom = "line", fun = "mean") +
  stat_summary(geom = "errorbar", fun.data = "mean_se",
               color = "black", size = 0.5, width = 0.1) +
  theme_classic()

savename <- "model2plausibility_DTFit.png"
ggsave(file.path(savedir, savename), width = 16, height = 8, units = "cm")
```

## EventsRev
```{r}
# Accuracy by layer for the EventsRev dataset, one panel per value of `Plot`.
dataset <- "EventsRev"

ggplot(data = dat %>% filter(Dataset == dataset),
       # Group by the Dataset column; the former `group = dataset` matched no
       # column and only worked by falling back to the global string.
       mapping = aes(x = Layer, y = Accuracy, group = Dataset)) +
  facet_grid(~Plot, scales = "free_x") +
  scale_x_continuous(breaks = breaks_fun, limits = c(0, NA)) +
  # Dotted reference lines at 0.5 and 1.
  geom_hline(yintercept = .5, linetype = "dotted") +
  geom_hline(yintercept = 1, linetype = "dotted") +
  # Mean per layer with standard-error bars.
  stat_summary(geom = "line", fun = "mean") +
  stat_summary(geom = "errorbar", fun.data = "mean_se",
               color = "black", size = 0.5, width = 0.1) +
  theme_classic()

savename <- "model2plausibility_EventsRev.png"
ggsave(file.path(savedir, savename), width = 16, height = 8, units = "cm")
```

## EventsAdapt

### Active Passive
```{r}
# EventsAdapt, "normal" trials only: accuracy by layer, one line per VoiceType
# and one panel per model.
dataset <- "EventsAdapt"

ggplot(
  data = filter(dat, Dataset == dataset, TrialType == "normal"),
  mapping = aes(x = Layer, y = Accuracy, color = VoiceType, group = VoiceType)
) +
  # Dotted reference lines at 0.5 and 1.
  geom_hline(yintercept = .5, linetype = "dotted") +
  geom_hline(yintercept = 1, linetype = "dotted") +
  # Mean per layer with standard-error bars (no caps).
  stat_summary(geom = "line", fun = "mean") +
  stat_summary(geom = "errorbar", fun.data = "mean_se", size = 0.2, width = 0) +
  facet_grid(~Model, scales = "free_x") +
  scale_x_continuous(breaks = breaks_fun, limits = c(0, NA)) +
  theme_classic()

savename <- "model2plausibility_EventsAdapt_active-passive.png"
ggsave(paste(savedir, savename, sep = "/"), width = 20, height = 8, units = "cm")
```

### Attempt with human ceiling
```{r, fig.width=15, fig.height=8}
# EventsAdapt, "normal" trials: accuracy by layer per VoiceType, with the
# human ceiling drawn to the right of each model's layers.
dataset <- "EventsAdapt"
plot_data <- merged %>%
  filter(Dataset == dataset, TrialType == "normal", Plot != "HumanCeiling")

# Human ceiling: one summary row per Model x VoiceType.
# The Dataset filter keeps the "normal" VoiceType ceiling from pooling the
# other datasets' rows. `merged` holds one full copy of the ceiling rows per
# model, so the summary is grouped by Model as well; pooling the copies would
# inflate n and shrink the standard error.
ceil_data <- merged %>%
  filter(Dataset == dataset, TrialType == "normal", Plot == "HumanCeiling") %>%
  group_by(Model, Layer, VoiceType) %>%
  summarise(
    MeanAccuracy = mean(Accuracy),
    SE = sd(Accuracy) / sqrt(n()),
    .groups = "drop"
  )

# One "ceiling" label per facet, shifted left of the ceiling marker
# (6 units for gpt2-xl, 3 for the other models).
ceil_labels <- ceil_data %>%
  distinct(Model, Layer) %>%
  mutate(LabelX = ifelse(Model == "gpt2-xl", Layer - 6, Layer - 3))

ggplot(data = plot_data,
       mapping = aes(x = Layer, y = Accuracy, color = VoiceType, group = VoiceType)) +
  facet_grid(~Model, scales = "free_x") +
  geom_hline(yintercept = .5, linetype = "dotted") +
  geom_hline(yintercept = 1, linetype = "dotted") +
  stat_summary(geom = "line", fun = "mean") +
  stat_summary(geom = "errorbar", fun.data = "mean_se",
               size = 0.2, width = 0.1) +
  # The ceiling layers use summary tables without an Accuracy column, so they
  # must not inherit the plot-level aesthetics.
  geom_point(data = ceil_data, inherit.aes = FALSE,
             aes(x = Layer, y = MeanAccuracy, color = VoiceType), size = 0.2) +
  geom_errorbar(data = ceil_data, inherit.aes = FALSE,
                aes(x = Layer, ymin = MeanAccuracy - SE, ymax = MeanAccuracy + SE,
                    color = VoiceType),
                width = 0.05) +
  geom_text(data = ceil_labels, inherit.aes = FALSE,
            aes(x = LabelX, y = 0.98),
            size = 2.5,
            label = "ceiling",
            colour = "#6d6d6d") +
  theme_classic() +
  scale_x_continuous(breaks = breaks_fun) +
  scale_y_continuous(breaks = seq(0.2, 1, 0.1))

# The device now matches the ".png" file name (it used to write TIFF data into
# a .png file). If TIFF output is required, change extension and device together.
savename <- "model2plausibility_EventsAdapt_active-passive_ceiled.png"
ggsave(file.path(savedir, savename), width = 20, height = 8, units = "cm",
       device = "png", dpi = 700)
```

### TrialTypes
```{r}
# EventsAdapt, active-active items, excluding trial types whose name contains
# "AAR": accuracy by layer, one line per TrialType and one panel per model.
dataset <- "EventsAdapt"

ggplot(
  data = filter(dat, Dataset == dataset, VoiceType == "active-active", !grepl('AAR', TrialType)),
  mapping = aes(x = Layer, y = Accuracy, color = TrialType, group = TrialType)
) +
  # Dotted reference lines at 0.5 and 1.
  geom_hline(yintercept = .5, linetype = "dotted") +
  geom_hline(yintercept = 1, linetype = "dotted") +
  # Mean per layer with standard-error bars (no caps).
  stat_summary(geom = "line", fun = "mean") +
  stat_summary(geom = "errorbar", fun.data = "mean_se", size = 0.2, width = 0) +
  facet_wrap(~Model, scales = "free_x") +
  scale_x_continuous(breaks = breaks_fun, limits = c(0, NA)) +
  theme_classic()

savename <- "model2plausibility_EventsAdapt_AI-AAN.png"
ggsave(paste(savedir, savename, sep = "/"), width = 20, height = 8, units = "cm")
```

### Attempt with human ceiling
```{r, fig.width=15, fig.height=8}
# EventsAdapt, active-active items without "AAR" trial types: accuracy by
# layer per TrialType, with the human ceiling drawn to the right of each
# model's layers.
dataset <- "EventsAdapt"
plot_data <- merged %>%
  filter(Dataset == dataset, VoiceType == "active-active",
         !grepl("AAR", TrialType), Plot != "HumanCeiling")

# Human ceiling: one summary row per Model x TrialType, restricted to the same
# dataset as the curves. `merged` holds one full copy of the ceiling rows per
# model, so the summary is grouped by Model as well; pooling the copies would
# inflate n and shrink the standard error.
ceil_data <- merged %>%
  filter(Dataset == dataset, VoiceType == "active-active",
         !grepl("AAR", TrialType), Plot == "HumanCeiling") %>%
  group_by(Model, Layer, TrialType) %>%
  summarise(
    MeanAccuracy = mean(Accuracy),
    SE = sd(Accuracy) / sqrt(n()),
    .groups = "drop"
  )

# One "ceiling" label per facet, shifted left of the ceiling marker
# (6 units for gpt2-xl, 3 for the other models).
ceil_labels <- ceil_data %>%
  distinct(Model, Layer) %>%
  mutate(LabelX = ifelse(Model == "gpt2-xl", Layer - 6, Layer - 3))

ggplot(data = plot_data,
       mapping = aes(x = Layer, y = Accuracy, color = TrialType, group = TrialType)) +
  facet_grid(~Model, scales = "free_x") +
  geom_hline(yintercept = .5, linetype = "dotted") +
  geom_hline(yintercept = 1, linetype = "dotted") +
  stat_summary(geom = "line", fun = "mean") +
  stat_summary(geom = "errorbar", fun.data = "mean_se",
               size = 0.2, width = 0.1) +
  # The ceiling layers use summary tables without an Accuracy column, so they
  # must not inherit the plot-level aesthetics.
  geom_point(data = ceil_data, inherit.aes = FALSE,
             aes(x = Layer, y = MeanAccuracy, color = TrialType), size = 0.2) +
  geom_errorbar(data = ceil_data, inherit.aes = FALSE,
                aes(x = Layer, ymin = MeanAccuracy - SE, ymax = MeanAccuracy + SE,
                    color = TrialType),
                width = 0.05) +
  geom_text(data = ceil_labels, inherit.aes = FALSE,
            aes(x = LabelX, y = 0.98),
            size = 2.5,
            label = "ceiling",
            colour = "#6d6d6d") +
  theme_classic() +
  scale_x_continuous(breaks = breaks_fun) +
  scale_y_continuous(breaks = seq(0.5, 1, 0.1))

# The device now matches the ".png" file name (it used to write TIFF data into
# a .png file). If TIFF output is required, change extension and device together.
savename <- "model2plausibility_EventsAdapt_AI-AAN_ceiled.png"
ggsave(file.path(savedir, savename), width = 20, height = 8, units = "cm",
       device = "png", dpi = 700)
```